library("readr")
library("igraph")
library("dplyr")
library("stringr")
library("scales")
library("textreuse")
source("R/helper.R")
source("R/section-matches.R")
Read the data.
# Load cached LSH comparison results; presumably provides the all_matches,
# best_matches, and summary_matches data frames used below — verify against
# the script that builds the cache.
load("cache/corpus-lsh.rda")
# Plotting helper used throughout this document.
source("R/spectrogram.R")
Spectrogram of borrowings in CA1850. I expect it will show heavy reliance on NY1849 amidst original material.
# Best matches per CA1850 section. NOTE(review): white_list presumably caps
# how many source codes get distinct colors — confirm in R/spectrogram.R.
spectrogram("CA1850", best_matches, white_list = 7)
## Loading required package: ggplot2
Spectrogram of borrowings in MD1855. I expect to see reliance on English legislation (GB1852 and GB1854), amidst original material.
# Best matches per MD1855 section; expecting GB1852/GB1854 borrowings.
spectrogram("MD1855", best_matches, white_list = 7)
Spectrogram of borrowings in WA1855. An earlier version showed two long runs of borrowings from Oregon and Indiana. If the latest data still visualizes that kind of splicing, I think it’s a great illustration.
# Best matches per WA1855 section; looking for long runs from OR and IN.
spectrogram("WA1855", best_matches, white_list = 8)
The point of this and the next exercise is not to show borrowings from all sources, but similarity between two sources. Our normal spectrogram of NV1861 will show heavy reliance on California. I want this to show that even though Nevada is two steps away from New York, its text is still very similar to New York throughout.
# One row per distinct NV1861 section, so sections without an NY match still
# appear (as NA) once matches are joined in.
nv1861_rows <- all_matches %>%
  filter(borrower_code == "NV1861")
all_NV1861 <- data_frame(
  borrower_code = "NV1861",
  borrower_section = sort(unique(nv1861_rows$borrower_section))
)
# For each NV1861 section, keep only its single strongest NY match
# (score >= 0.1, NY code published by 1861).
matches_to_NY <- all_matches %>%
  filter(borrower_code == "NV1861") %>%
  filter(str_detect(match_code, "NY")) %>%
  filter(score >= 0.1, match_year <= 1861) %>%
  group_by(borrower_section) %>%
  arrange(desc(score)) %>%
  slice(1) %>%
  ungroup()
# Join explicitly on the shared keys rather than relying on left_join's
# implicit column matching (which emitted the "Joining by" message); the
# result is identical but the keys are now documented in the code.
NV1861toNY <- all_NV1861 %>%
  left_join(matches_to_NY, by = c("borrower_code", "borrower_section")) %>%
  arrange(borrower_section)
spectrogram("NV1861", NV1861toNY, white_list = 5,
            title = "Sections in NV1861 with high similarity to NY codes")
Like the previous query for NV1861, Iowa is two steps away from New York. This time, however, the text is much less similar to New York. I only want one visualization for this, but I’m not sure if IA1851 or the revision IA1859 will be better, so can we do both?
IA1851
# One row per distinct IA1851 section, so unmatched sections survive the
# later left join as NA rows.
ia1851_rows <- all_matches %>%
  filter(borrower_code == "IA1851")
all_IA1851 <- data_frame(
  borrower_code = "IA1851",
  borrower_section = sort(unique(ia1851_rows$borrower_section))
)
# Strongest NY match per IA1851 section (score >= 0.1, published by 1851).
matches_to_NY <- all_matches %>%
  filter(borrower_code == "IA1851") %>%
  filter(str_detect(match_code, "NY")) %>%
  filter(score >= 0.1, match_year <= 1851) %>%
  group_by(borrower_section) %>%
  arrange(desc(score)) %>%
  slice(1) %>%
  ungroup()
# Explicit join keys instead of left_join's implicit matching (which emitted
# the "Joining by" message); result unchanged, intent documented.
IA1851toNY <- all_IA1851 %>%
  left_join(matches_to_NY, by = c("borrower_code", "borrower_section")) %>%
  arrange(borrower_section)
spectrogram("IA1851", IA1851toNY, white_list = 10,
            title = "Sections in IA1851 with matches to NY codes")
IA1859
# One row per distinct IA1859 section, so unmatched sections survive the
# later left join as NA rows.
ia1859_rows <- all_matches %>%
  filter(borrower_code == "IA1859")
all_IA1859 <- data_frame(
  borrower_code = "IA1859",
  borrower_section = sort(unique(ia1859_rows$borrower_section))
)
# Strongest NY match per IA1859 section (score >= 0.1, published by 1859).
matches_to_NY <- all_matches %>%
  filter(borrower_code == "IA1859") %>%
  filter(str_detect(match_code, "NY")) %>%
  filter(score >= 0.1, match_year <= 1859) %>%
  group_by(borrower_section) %>%
  arrange(desc(score)) %>%
  slice(1) %>%
  ungroup()
# Explicit join keys instead of left_join's implicit matching (which emitted
# the "Joining by" message); result unchanged, intent documented.
IA1859toNY <- all_IA1859 %>%
  left_join(matches_to_NY, by = c("borrower_code", "borrower_section")) %>%
  arrange(borrower_section)
spectrogram("IA1859", IA1859toNY, white_list = 10,
            title = "Sections in IA1859 with matches to NY codes")
Spectrogram of borrowings in NC1868. Should show heavy reliance on one or another New York Code, with scattered provisions coming from elsewhere.
# Best matches per NC1868 section; expecting heavy reliance on a NY code.
spectrogram("NC1868", best_matches, white_list = 8)
Spectrogram of borrowings in CO1868. Should show heavy reliance on Illinois law.
# Best matches per CO1868 section; expecting heavy reliance on Illinois law.
spectrogram("CO1868", best_matches, white_list = 8)
Same query as numbers 6 and 7, but, like Louisiana, the point is to show that no one borrows the later New York Code, the one that departs from the Field tradition.
# The later, post-Field NY codes; expecting little borrowing from them.
spectrogram("NY1876", best_matches, white_list = 8)
spectrogram("NY1879", best_matches, white_list = 8)
Here I’m looking for the reverse of the previous spectrograms. Rather than illustrate sections BORROWED FROM elsewhere, can we illustrate sections in the original Field Code that GET BORROWED elsewhere, and instead of coloring by where they end up, color based on how many times a particular section appears throughout the corpus?
I expect the beginning of the code will be grey — no one has New York’s particular court system, so no one copies the jurisdictional texts — then high heat at the heart of the Code with the Field reforms, slackening off around particular proceedings, some of which will be New York specific, then heating up again towards the end with the code of evidence.
Create a network graph based on section percentages.
# Edge list weighted by percentage of sections borrowed: drop missing
# matches, keep pairs covering at least 5% of sections, then take the two
# strongest sources per borrowing code (result stays grouped, as before).
edges_pct <- summary_matches %>%
  filter(!is.na(match_code)) %>%
  filter(percent_borrowed >= 0.05) %>%
  select(borrower_code, match_code, weight = percent_borrowed) %>%
  group_by(borrower_code) %>%
  top_n(2, weight)
# Print the edge list for inspection (output echoed below).
edges_pct
## Source: local data frame [136 x 3]
## Groups: borrower_code [83]
##
## borrower_code match_code weight
## (chr) (chr) (dbl)
## 1 AK1900 OR1862 0.5937
## 2 AR1868 KY1851 0.3634
## 3 AR1868 KY1854 0.3158
## 4 AR1874 AR1868 0.6752
## 5 AR1874 KY1851 0.0818
## 6 AZ1865 CA1851 0.5476
## 7 AZ1865 CA1858 0.2590
## 8 AZ1887 CA1872 0.4313
## 9 CA1850 NY1849 0.2972
## 10 CA1850 NY1850 0.1207
## .. ... ... ...
# Directed borrowing graph; edges point from borrower to source code.
g <- graph_from_data_frame(edges_pct, directed = TRUE)
# Color vertices by unweighted hop distance to NY1850: red = 0 hops,
# green = 1 (direct borrowers), yellow = 2, light blue = farther/unreachable.
# NOTE(review): add_rownames() is deprecated in later tibble releases;
# rownames_to_column() is the modern replacement.
nodes <- distances(g, to = "NY1850", algorithm = "unweighted") %>% as.data.frame() %>%
add_rownames() %>%
rename(name = rowname, distance = NY1850) %>%
mutate(color = ifelse(distance == 0, "red",
ifelse(distance == 1, "green",
ifelse(distance == 2, "yellow", "lightblue"))))
## Warning in distances(g, to = "NY1850", algorithm = "unweighted"):
## Unweighted algorithm chosen, weights ignored
# Force all four Field Code editions red regardless of computed distance.
nodes[nodes$name == "NY1848", "color"] <- "red"
nodes[nodes$name == "NY1849", "color"] <- "red"
nodes[nodes$name == "NY1850", "color"] <- "red"
nodes[nodes$name == "NY1851", "color"] <- "red"
# Rebuild the graph so the vertex attributes (distance, color) attach.
g <- graph_from_data_frame(edges_pct, directed = TRUE, vertices = nodes)
V(g)$year <- V(g)$name %>% extract_date()
# Fixed seed so the force-directed layout is reproducible across runs.
set.seed(4221)
g <- add_layout_(g, with_graphopt(niter = 4000, spring.length = 25), normalize())
# Plot the subgraph of codes enacted in or before `year`, reusing the rows of
# the precomputed full-graph layout that correspond to the kept vertices.
plot_before_year <- function(x, year) {
  x_before <- induced.subgraph(x, which(V(x)$year <= year))
  # Logical mask over the full vertex set; avoids shadowing dplyr::filter.
  keep <- V(x)$name %in% V(x_before)$name
  x_before$layout <- x_before$layout[keep, ]
  par(mar = c(0, 0, 1, 0))
  plot(x_before,
       edge.width = E(x_before)$weight * 8,
       edge.arrow.size = 0.0,
       vertex.size = 5)
  title(paste0("Codes of Civil Procedure before ", year))
}
# Snapshot the network at five-year intervals from 1850 through 1900.
for (yr in seq(1850, 1900, by = 5)) {
  plot_before_year(g, yr)
}
Create a graph based on numbers (not percentages) of sections shared. Notice that we are keeping only code to code matches that share a certain number of sections (minimum_n), we are keeping only a certain number of matches for each code (top_matches), and we are omitting codes that aren’t part of the main network.
# Thresholds for the section-count network: a code pair must share at least
# minimum_n sections, and only the top_matches strongest sources per code
# are kept.
minimum_n <- 20
top_matches <- 2
# Codes excluded from the plot because they are not part of the main
# network; commented entries were considered but left in.
# NOTE(review): "CO868" below looks like a typo for "CO1868" — harmless
# while commented out, but fix before uncommenting.
codes_not_to_plot <- c(
# "CO868",
# "CT1879",
# "FL1847",
# "FL1892",
"GA1851",
"GA1860",
"HI1859",
"HI1897",
# "IL1866",
"LA1825",
"LA1844"
# "MS1848",
# "MS1857",
# "NY1876",
# "NY1879",
# "VA1860",
# "VA1893",
# "WV1868"
)
# codes_not_to_plot <- NULL
# Edge list weighted by raw section counts: keep pairs sharing at least
# minimum_n sections, take the top_matches strongest sources per code,
# rescale weights to [0, 1], and drop the excluded codes.
edges_n <- summary_matches %>%
  filter(!is.na(match_code)) %>%
  filter(sections_borrowed >= minimum_n) %>%
  select(borrower_code, match_code, weight = sections_borrowed) %>%
  group_by(borrower_code) %>%
  top_n(top_matches, weight) %>%
  ungroup() %>%
  mutate(weight = rescale(weight)) %>%
  filter(!borrower_code %in% codes_not_to_plot) %>%
  filter(!match_code %in% codes_not_to_plot)
# Print the edge list for inspection (output echoed below).
edges_n
## Source: local data frame [146 x 3]
##
## borrower_code match_code weight
## (chr) (chr) (dbl)
## 1 AK1900 OR1862 0.398192771
## 2 AK1900 NY1850 0.002409639
## 3 AK1900 OR1854 0.002409639
## 4 AR1868 KY1851 0.209036145
## 5 AR1868 KY1854 0.180120482
## 6 AR1874 AR1868 0.146987952
## 7 AR1874 KY1851 0.007228916
## 8 AZ1865 CA1851 0.199397590
## 9 AZ1865 CA1858 0.087951807
## 10 AZ1887 CA1872 0.203614458
## .. ... ... ...
# Directed borrowing graph weighted by section counts.
g_n <- graph_from_data_frame(edges_n, directed = TRUE)
# For each vertex, hop distance to the NEAREST of the four Field Code
# editions (row-wise minimum over the distance matrix).
node_distances <- distances(g_n, to = c("NY1848", "NY1849", "NY1850", "NY1851"),
algorithm = "unweighted") %>%
apply(1, min, na.rm = TRUE)
## Warning in distances(g_n, to = c("NY1848", "NY1849", "NY1850", "NY1851"), :
## Unweighted algorithm chosen, weights ignored
# Same red/green/yellow/lightblue color scheme as the percentage network.
nodes_n <- data_frame(name = names(node_distances), distance = node_distances) %>%
mutate(color = ifelse(distance == 0, "red",
ifelse(distance == 1, "green",
ifelse(distance == 2, "yellow", "lightblue"))))
# Rebuild the graph with the vertex attributes attached.
g_n <- graph_from_data_frame(edges_n, directed = TRUE, vertices = nodes_n)
V(g_n)$year <- V(g_n)$name %>% extract_date()
# Scale a graph's edge weights for plotting, clamped into
# [min_val, max_val]. Clamps the top first, then the bottom, exactly as the
# elementwise version did.
edge_size_clamp <- function(g, multiplier = 20, max_val = 6, min_val = 1) {
  pmax(pmin(E(g)$weight * multiplier, max_val), min_val)
}
# Fixed seed so the force-directed layout is reproducible across runs.
set.seed(4221)
g_n <- g_n %>% add_layout_(with_graphopt(niter = 4000, spring.length = 25),
normalize())
# Tight margins, leaving one line at the top for the title.
par(mar = c(0,0,1,0))
plot(g_n, edge.width = edge_size_clamp(g_n), edge.arrow.size = 0, vertex.size = 5)
title("Borrowings between codes, number of sections borrowed")
Now do a state to state network:
# State-to-state network: aggregate code-level borrowings up to states.
min_state_borrowings <- 100
top_matches <- 2
edges_states <- summary_matches %>%
  mutate(borrower_date = extract_date(borrower_code),
         match_date = extract_date(match_code),
         borrower_state = extract_state(borrower_code),
         match_state = extract_state(match_code)) %>%
  # Keep only chronologically possible, out-of-state borrowings.
  filter(!is.na(match_code)) %>%
  filter(borrower_date >= match_date) %>%
  filter(borrower_state != match_state) %>%
  group_by(borrower_state, match_state) %>%
  summarize(n = sum(sections_borrowed)) %>%
  filter(n >= min_state_borrowings) %>%
  select(borrower_state, match_state, weight = n) %>%
  # Top two source states per borrowing state, rescaled to [0, 1].
  group_by(borrower_state) %>%
  top_n(top_matches, weight) %>%
  ungroup() %>%
  mutate(weight = rescale(weight))
# Print the edge list for inspection (output echoed below).
edges_states
## Source: local data frame [44 x 3]
##
## borrower_state match_state weight
## (chr) (chr) (dbl)
## 1 AK OR 0.57655039
## 2 AR KY 0.60562016
## 3 AZ CA 0.75484496
## 4 CA NY 0.69670543
## 5 CO CA 0.05910853
## 6 CO IL 0.04166667
## 7 DC IN 0.11821705
## 8 DT ND 0.22674419
## 9 DT NE 0.19476744
## 10 FL NY 0.03488372
## .. ... ... ...
# Directed state-to-state graph, colored by hop distance to New York.
g_states <- graph_from_data_frame(edges_states, directed = TRUE)
state_distances <- distances(g_states, to = "NY", algorithm = "unweighted")
## Warning in distances(g_states, to = "NY", algorithm = "unweighted"):
## Unweighted algorithm chosen, weights ignored
nodes_states <- data_frame(name = rownames(state_distances),
                           distance = state_distances[, 1]) %>%
  mutate(color = ifelse(distance == 0, "red",
                        ifelse(distance == 1, "green",
                               ifelse(distance == 2, "yellow", "lightblue"))))
# Rebuild with vertex attributes, then keep the first component with at
# least 3 vertices (assumed to be the main network — verify).
state_components <- graph_from_data_frame(edges_states, directed = TRUE,
                                          vertices = nodes_states) %>%
  decompose(min.vertices = 3)
g_states <- state_components[[1]]
# Fixed seed so the force-directed layout is reproducible across runs.
set.seed(4221)
g_states <- g_states %>% add_layout_(with_graphopt(niter = 4000,
                                                   spring.length = 25),
                                     normalize())
par(mar = c(0, 0, 1, 0))
# BUG FIX: edge widths were computed from g_n (the code-level graph), whose
# edge count differs from g_states, so the width vector was silently
# recycled against the wrong edges. Compute widths from g_states itself.
plot(g_states,
     edge.width = edge_size_clamp(g_states), edge.arrow.size = 0.5,
     edge.arrow.mode = 1,
     vertex.size = 5, vertex.label.dist = 0.85, vertex.label.degree = pi)
title("Borrowings between states, number of sections borrowed")